How to use iCellR for analyzing scRNA-seq data
Download sample data
# set your working directory
setwd("/your/download/directory")
# save the URL as an object
sample.file.url = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz"
# download the file
download.file(url = sample.file.url,
destfile = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
method = "auto")
# unzip the file.
untar("pbmc3k_filtered_gene_bc_matrices.tar.gz")
Load iCellR package
library(iCellR)
Read 10x data
my.data <- load10x("filtered_gene_bc_matrices/hg19/",gene.name = "geneSymbol")
Look at the first few lines and columns of the sample data. And count the rows and columns.
head(my.data)[1:5]
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
## MIR1302.10 0 0 0
## FAM138A 0 0 0
## OR4F5 0 0 0
## RP11.34P13.7 0 0 0
## RP11.34P13.8 0 0 0
## AL627309.1 0 0 0
## AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
## MIR1302.10 0 0
## FAM138A 0 0
## OR4F5 0 0
## RP11.34P13.7 0 0
## RP11.34P13.8 0 0
## AL627309.1 0 0
dim(my.data)
## [1] 32738 2700
You can devide the sample into 3 equal parts. Assuming that you have three samples.
sample1 <- my.data[1:900]
sample2 <- my.data[901:1800]
sample3 <- my.data[1801:2700]
Merge samples
Conditions in iCellR are set in the header of the data and are separated by an underscore (_)
my.data <- data.aggregation(samples = c("sample1","sample2","sample3"),
condition.names = c("WT","KO","Ctrl"))
head(my.data)[1:5]
## WT_AAACATACAACCAC.1 WT_AAACATTGAGCTAC.1 WT_AAACATTGATCAGC.1
## A1BG 0 0 1
## A1BG.AS1 0 0 0
## A1CF 0 0 0
## A2M 0 0 0
## A2M.AS1 0 0 0
## A2ML1 0 0 0
## WT_AAACCGTGCTTCCG.1 WT_AAACCGTGTATGCG.1
## A1BG 0 0
## A1BG.AS1 0 0
## A1CF 0 0
## A2M 0 0
## A2M.AS1 0 0
## A2ML1 0 0
Make an iCellR object
my.obj <- make.obj(my.data)
Look at the object
my.obj
## [1] "An object of class iCellR version: 0.99.0"
## [2] "Raw/original data dimentions (rows,columns): 32738,2700"
## [3] "Data conditions in raw data: Ctrl,KO,WT (900,900,900)"
## [4] "Columns names: WT_AAACATACAACCAC.1,WT_AAACATTGAGCTAC.1,WT_AAACATTGATCAGC.1 ..."
## [5] "Row names: A1BG,A1BG.AS1,A1CF ..."
Run basic QC
my.obj <- qc.stats(my.obj,
which.data = "raw.data",
mito.genes = "defult",
s.phase.genes = s.phase,
g2m.phase.genes = g2m.phase)
stats.plot(my.obj,
plot.type = "all.in.one",
out.name = "UMI-plot",
interactive = FALSE,
cell.color = "slategray3",
cell.size = 1,
cell.transparency = 0.5,
box.color = "red",
box.line.col = "green")

# Scatter plots
stats.plot(my.obj, plot.type = "point.mito.umi", out.name = "mito-umi-plot",interactive = F)
stats.plot(my.obj, plot.type = "point.gene.umi", out.name = "gene-umi-plot",interactive = F)

Filtering cells
iCellR allows you to filter based on library sizes (UMIs), number of genes per cell, percent mitochondrial content, one or more genes, and cell ids.
my.obj <- cell.filter(my.obj,
min.mito = 0,
max.mito = 0.05,
min.genes = 200,
max.genes = 2400,
min.umis = 0,
max.umis = Inf)
## [1] "cells with min mito ratio of 0 and max mito ratio of 0.05 were filtered."
## [1] "cells with min genes of 200 and max genes of 2400 were filtered."
## [1] "No UMI number filter"
## [1] "No cell filter by provided gene/genes"
## [1] "No cell id filter"
## [1] "filters_set.txt file has beed generated and includes the filters set for this experiment."
This step is optional and is for having the same number of cells for each condition.
# optional
# my.obj <- down.sample(my.obj)
#[1] "From"
#[1] "Data conditions: Ctrl,KO,WT (877,877,883)"
#[1] "to"
#[1] "Data conditions: Ctrl,KO,WT (877,877,877)"
Normalizing data
You have a few options to normalize your data based on your study. You can also normalize your data using tools other than iCellR and import your data to iCellR. We recommend “ranked.glsf” normalization for most single cell studies. This normalization is great for fixing matrixes with lots of zeros and because it’s geometric it is great for fixing for batch effects, as long as all the data is aggregated into one file (to aggregate your data see “aggregating data” section above).
- Choose from the following methods:
- deseq (best for bulk RNA-Seq)
- ranked.deseq (best for scRNA-Seq)
- global.glsf (best for bulk RNA-Seq)
- ranked.glsf (best for scRNA-Seq)
- rpm (best for bulk RNA-Seq)
- spike.in
For spike.in normalization you need to provide the spike.in vlaues to normalize the data with.
my.obj <- norm.data(my.obj,
norm.method = "ranked.glsf",
top.rank = 500)
Scaling data
my.obj <- data.scale(my.obj)
Calculating gene stats
my.obj <- gene.stats(my.obj, which.data = "main.data")
Make gene model for clustering
It’s best to always to avoid global clustering and use a set of model genes. In bulk RNA-seq data it is very common to cluster the samples based on top 500 genes ranked by base mean, this is to reduce the noise. In scRNA-seq data, it’s great to do so as well.
make.gene.model(my.obj,
dispersion.limit = 1.5,
base.mean.rank = 500,
no.mito.model = T,
no.cell.cycle = T,
mark.mito = T,
interactive = F,
out.name = "gene.model")
## [1] "my_model_genes.txt file is generated, which can be used for clustering."

Run PCA
my.obj <- run.pca(my.obj,
clust.method = "gene.model",
gene.list = readLines("my_model_genes.txt"),
batch.norm = F)
# Re-define model genes (run this if you have real samples)
# find.dim.genes(my.obj, dims = 1:10, top.pos = 20, top.neg = 10)
# Second round PCA and batch correction (run this if you have real sample)
#my.obj <- run.pca(my.obj,
# clust.method = "gene.model",
# gene.list = readLines("my_model_PC_genes.txt"),
# batch.norm = T)
# Visualize standard deviations for PCs
opt.pcs.plot(my.obj)

Cluster the data
Here we cluster the first 10 PC dimensions of the data. You have the option of clustering your data based on the following methods.
- Clustering methods:
- ward.D
- ward.D2
- single
- complete
- average
- mcquitty
- median
- centroid
- kmeans
- Distance methods:
- euclidean
- maximum
- manhattan
- canberra
- binary
- minkowski
- Indexing methods
- kl
- ch
- hartigan
- ccc
- scott
- marriot
- trcovw
- tracew
- friedman
- rubin
- cindex
- db
- silhouette
- duda
- pseudot2
- beale
- ratkowsky
- ball
- ptbiserial
- gap
- frey
- mcclain
- gamma
- gplus
- tau
- dunn
- hubert
- sdindex
- dindex
- sdbw
- all
my.obj <- run.clustering(my.obj,
clust.method = "kmeans",
dist.method = "euclidean",
index.method = "silhouette",
max.clust = 25,
min.clust = 2,
dims = 1:10)
Dimentionality reduction
# tSNE
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
# UMAP
my.obj <- run.umap(my.obj, dims = 1:10, method = "umap-learn")
# Diffusion map
my.obj <- run.diffusion.map(my.obj, dims = 1:10)
Create avarge expression per cluster
my.obj <- clust.avg.exp(my.obj)
Data imputaion
my.obj <- run.impute(my.obj)
Save object
save(my.obj, file = "my.obj.Robj")
Visualizing clusters and conditions
# clusters (tSNE)
cluster.plot(my.obj,
plot.type = "tsne",
interactive = F)
# Conditions (tSNE)
cluster.plot(my.obj,
plot.type = "tsne",
col.by = "conditions",
interactive = F)

# clusters (UMAP)
cluster.plot(my.obj,
plot.type = "umap",
interactive = F)
# Conditions (UMAP)
cluster.plot(my.obj,
plot.type = "umap",
col.by = "conditions",
interactive = F)

# clusters (diffusion)
cluster.plot(my.obj,
plot.type = "diffusion",
interactive = F)
# Conditions (diffusion)
cluster.plot(my.obj,
plot.type = "diffusion",
col.by = "conditions",
interactive = F)

Cell frequencies in clusters and conditions
# bar
clust.cond.info(my.obj, plot.type = "bar", normalize = T)
# pie
clust.cond.info(my.obj, plot.type = "pie", normalize = T)
## [1] "clust_cond_freq_info.txt file has beed generated."
## [1] "clust_cond_freq_info.txt file has beed generated."

Find marker genes for clusters
marker.genes <- findMarkers(my.obj,
fold.change = 2,
padjval = 0.1)
head(marker.genes)
## baseMean baseSD AvExpInCluster AvExpInOtherClusters
## GZMK 0.41991651 1.6966460 2.4363833 0.11537810
## LAG3 0.04546080 0.2827279 0.2468928 0.01503939
## CD8A 0.21021541 0.7086838 1.0163009 0.08847574
## GZMH 0.47430404 1.9741941 2.2865498 0.20060827
## CCL5 2.62409548 6.5488179 12.3863881 1.14973789
## RP11.222K16.2 0.02626753 0.2084994 0.1087713 0.01380733
## foldChange log2FoldChange pval padj clusters
## GZMK 21.116513 4.400300 7.120249e-27 1.478164e-23 1
## LAG3 16.416407 4.037066 1.168625e-10 2.410873e-07 1
## CD8A 11.486775 3.521902 8.226752e-28 1.708696e-24 1
## GZMH 11.398084 3.510719 1.184154e-19 2.452383e-16 1
## CCL5 10.773228 3.429379 2.251589e-72 4.687808e-69 1
## RP11.222K16.2 7.877796 2.977792 1.436435e-05 2.920273e-02 1
## gene cluster_1 cluster_2 cluster_3 cluster_4
## GZMK GZMK 2.4363833 0.05291050 0.00000 0.075901225
## LAG3 LAG3 0.2468928 0.00000000 0.00000 0.007326283
## CD8A CD8A 1.0163009 0.03154194 0.00000 0.029272097
## GZMH GZMH 2.2865498 0.02629519 0.00000 0.034177635
## CCL5 CCL5 12.3863881 0.21033406 33.79822 0.250237643
## RP11.222K16.2 RP11.222K16.2 0.1087713 0.00000000 0.00000 0.000000000
## cluster_5 cluster_6 cluster_7
## GZMK 0.40592901 0.127154863 0.027465519
## LAG3 0.01062049 0.026494853 0.000000000
## CD8A 0.11671291 0.141884155 0.017431855
## GZMH 2.63214509 0.016212854 0.027473883
## CCL5 8.84515165 0.602894477 0.122993436
## RP11.222K16.2 0.13921096 0.008496299 0.003296331
Visualizing gene expressions
MyGenes <- unique(top.markers(marker.genes, topde = 10, min.base.mean = 0.2))
MyGenes
## [1] "GZMK" "CD8A" "GZMH" "CCL5" "KLRG1"
## [6] "CD8B" "LYAR" "CST7" "GZMA" "NKG7"
## [11] "IGLL5" "LINC00926" "CD79A" "TCL1A" "MS4A1"
## [16] "CD79B" "HLA.DQB1" "HLA.DQA1" "HLA.DQA2" "HVCN1"
## [21] "PPBP" "PF4" "GPX1" "TAGLN2" "CALM3"
## [26] "OAZ1" "ACTB" "MYL6" "ITM2B" "S100A8"
## [31] "S100A9" "LGALS2" "CD14" "MS4A6A" "LYZ"
## [36] "FCN1" "GRN" "CDA" "BLVRB" "GNLY"
## [41] "GZMB" "SPON2" "FGFBP2" "PRF1" "CCL4"
## [46] "CD247" "LEF1" "AQP3" "CCR7" "PRKCQ.AS1"
## [51] "TRAT1" "FLT3LG" "PIK3IP1" "IL7R" "LDHB"
## [56] "TCF7" "MS4A7" "FCGR3A" "IFITM3" "LST1"
## [61] "HCK" "RHOC" "SERPINA1" "IFI30" "AIF1"
## [66] "FCER1G"
genelist = c("PPBP","LYZ","MS4A1","GNLY","LTB","NKG7","IFITM2","CD14","S100A9")
for(i in genelist){
MyPlot <- gene.plot(my.obj, gene = i,
interactive = F,
plot.data.type = "umap",
cell.transparency = 1)
i <- gsub("-",".",i)
eval(call("<-", as.name(i), MyPlot))
}
##
library(gridExtra)
grid.arrange(PPBP,LYZ,MS4A1,GNLY,LTB,NKG7,IFITM2,CD14,S100A9)

genelist = c("PPBP","LYZ","MS4A1","GNLY","LTB","NKG7","IFITM2","CD14","S100A9")
for(i in genelist){
MyPlot <- gene.plot(my.obj, gene = i,
interactive = F,
plot.data.type = "tsne",
cell.transparency = 1)
i <- gsub("-",".",i)
eval(call("<-", as.name(i), MyPlot))
}
##
library(gridExtra)
grid.arrange(PPBP,LYZ,MS4A1,GNLY,LTB,NKG7,IFITM2,CD14,S100A9)

QC on clusters
clust.stats.plot(my.obj, plot.type = "box.mito", interactive = F)
clust.stats.plot(my.obj, plot.type = "box.gene", interactive = F)

Cell type prediction using ImmGen
## get marker genes for cluster 2
Cluster = 2
MyGenes <- unique(top.markers(marker.genes, topde = 40, min.base.mean = 0.2, cluster = Cluster))
MyGenes
## [1] "IGLL5" "LINC00926" "CD79A" "TCL1A" "MS4A1"
## [6] "CD79B" "HLA.DQB1" "HLA.DQA1" "HLA.DQA2" "HVCN1"
## [11] "CD74" "HLA.DMB" "HLA.DRA" "HLA.DMA" "HLA.DPB1"
## [16] "HLA.DRB1" "HLA.DRB5" "HLA.DPA1" "SNX2" "CD37"
## [21] "LY86" "NCF1" "FAIM3" "SNHG7" "ISG20"
## [26] "FAM26F" "CXCR4" "SEC62" "RCSD1" "TMEM243"
## [31] "POLD4" "LAPTM5" "DCK" "EZR" "PAPOLA"
## [36] "POU2F2" "RNASEH2B"
# ImmGen dot plot
imm.gen(immgen.data = "rna", gene = MyGenes, plot.type = "point.plot")
imm.gen(immgen.data = "uli.rna", gene = MyGenes, plot.type = "point.plot", top.cell.types = 50)
As seen in the dot plots the B cells are in the top, suggesting that cluster 2 is B cells.

# ImmGen dot plot
imm.gen(immgen.data = "rna", gene = MyGenes, plot.type = "heatmap")
imm.gen(immgen.data = "uli.rna", gene = MyGenes, plot.type = "heatmap")
As seen in the heatmaps the B cells are more expressed for these genes.


Differential Expression Analysis
# Between clusters
diff.res <- diff.exp(my.obj, de.by = "clusters", cond.1 = c(1,4), cond.2 = c(2))
diff.res1 <- as.data.frame(diff.res)
diff.res1 <- subset(diff.res1, padj < 0.05)
head(diff.res1)
## baseMean 1_4 2 foldChange
## ABI3 0.21809887 2.839549e-01 0.054036489 0.19029955
## AC006369.2 0.02330890 3.266529e-02 0.000000000 0.00000000
## AC079767.4 0.05360692 9.323288e-04 0.184831350 198.24696396
## AC092580.4 0.06657490 9.184304e-02 0.003626185 0.03948241
## AC093673.5 0.04228509 5.724388e-02 0.005019326 0.08768318
## ACTB 17.26037308 2.050377e+01 9.180343474 0.44773940
## log2FoldChange pval padj
## ABI3 -2.393656 9.582075e-10 1.381160e-05
## AC006369.2 -Inf 2.449500e-06 3.457715e-02
## AC079767.4 7.631155 1.004322e-09 1.447328e-05
## AC092580.4 -4.662646 1.290914e-10 1.868082e-06
## AC093673.5 -3.511556 1.230512e-07 1.755818e-03
## ACTB -1.159269 4.796900e-94 7.126275e-90
## Between conditions
# diff.res <- diff.exp(my.obj, de.by = "conditions", cond.1 = c("WT"), cond.2 = c("KO"))
## Betwen conditions in the same cluster
# diff.res <- diff.exp(my.obj, de.by = "clustBase.condComp", cond.1 = c("WT"), cond.2 = c("KO"), base.cond = 1)
## Between clusters in the same condition
# diff.res <- diff.exp(my.obj, de.by = "condBase.clustComp", cond.1 = c(1), cond.2 = c(2), base.cond = "WT")
Volcano and MA plots
# Volcano Plot
volcano.ma.plot(diff.res,
sig.value = "pval",
sig.line = 0.05,
plot.type = "volcano",
interactive = F)
# MA Plot
volcano.ma.plot(diff.res,
sig.value = "pval",
sig.line = 0.05,
plot.type = "ma",
interactive = F)

Data manipulation
# let's say you want to merge cluster 3 and 2.
my.obj <- change.clust(my.obj, change.clust = 3, to.clust = 2)
# to reset to the original clusters run this.
my.obj <- change.clust(my.obj, clust.reset = T)
# you can also re-name the cluster numbers to cell types. Remember to reset after this so you can ran other analysis.
my.obj <- change.clust(my.obj, change.clust = 7, to.clust = "B Cell")
# Let's say for what ever reason you want to remove acluster, to do so run this.
my.obj <- clust.rm(my.obj, clust.to.rm = 1)
Cell gating
# first make a plot to gate
my.plot <- gene.plot(my.obj, gene = "GNLY",
plot.type = "scatterplot",
clust.dim = 2,
interactive = F)
# gate and download cell ids.
cell.gating(my.obj, my.plot = my.plot)
# once the cell ids are download (cellGating.txt), you can assign a cluster number to them.
my.obj <- gate.to.clust(my.obj, my.gate = "cellGating.txt", to.clust = 10)


Pseudotime analysis
MyGenes <- unique(top.markers(marker.genes, topde = 10, min.base.mean = 0.2))
#
pseudotime.tree(my.obj,
marker.genes = MyGenes,
type = "unrooted",
clust.method = "complete")
#
pseudotime.tree(my.obj,
marker.genes = MyGenes,
type = "jitter",
clust.method = "complete")


CITE-Seq
Read RNA data
RNA.data <- read.delim("CITE_RNA.tsv",header=TRUE)
rownames(RNA.data) <- RNA.data$gene
RNA.data <- RNA.data[,-1]
# if you had multiple data you can marge them like this
#RNA.data <- data.aggregation(samples = c("sample1","sample2"), condition.names = c("KO","WT"))
Read ADT data
adt.data <- read.delim("CITE_ADT.tsv",header=TRUE)
rownames(adt.data) <- adt.data$gene
adt.data <- adt.data[,-1]
# if you had multiple data you can marge them like this
#adt.data <- data.aggregation(samples = c("sample1","sample2"), condition.names = c("KO","WT"))
Make iCellR object
my.obj <- make.obj(RNA.data)
my.obj <- add.adt(my.obj, adt.data = adt.data)
# look at your object
my.obj
## [1] "An object of class iCellR version: 0.99.0"
## [2] "Raw/original data dimentions (rows,columns): 20501,8617"
## [3] "Data conditions: no conditions/single sample"
## [4] "Columns names: CTGTTTACACCGCTAG,CTCTACGGTGTGGCTC,AGCAGCCAGGCTCATT ..."
## [5] "Row names: A1BG,A1BG-AS1,A1CF ..."
# look at ADTs
head(my.obj@adt.raw)[1:5]
## CTGTTTACACCGCTAG CTCTACGGTGTGGCTC AGCAGCCAGGCTCATT
## ADT_CD3 60 52 89
## ADT_CD4 72 49 112
## ADT_CD8 76 59 61
## ADT_CD45RA 575 3943 682
## ADT_CD56 64 68 87
## ADT_CD16 161 107 117
## GAATAAGAGATCCCAT GTGCATAGTCATGCAT
## ADT_CD3 55 63
## ADT_CD4 66 80
## ADT_CD8 56 94
## ADT_CD45RA 378 644
## ADT_CD56 58 104
## ADT_CD16 82 168
Perform RNA data analysis as above
my.obj <- qc.stats(my.obj,
which.data = "raw.data",
mito.genes = "defult",
s.phase.genes = s.phase,
g2m.phase.genes = g2m.phase)
#
stats.plot(my.obj,
plot.type = "all.in.one",
out.name = "UMI-plot",
interactive = FALSE,
cell.color = "slategray3",
cell.size = 1,
cell.transparency = 0.5,
box.color = "red",
box.line.col = "green")
#
my.obj <- cell.filter(my.obj,
min.mito = 0,
max.mito = 0.80,
min.genes = 200,
max.genes = 4000,
min.umis = 0,
max.umis = Inf)
#
my.obj <- norm.data(my.obj, norm.method = "ranked.glsf", top.rank = 500)
#
my.obj <- gene.stats(my.obj, which.data = "main.data")
#
make.gene.model(my.obj,
dispersion.limit = 1.5,
base.mean.rank = 500,
no.mito.model = T,
mark.mito = T,
interactive = F,
out.name = "gene.model")
Normalize and merge ADTs with the main data to include them in the analysis
# my.obj <- norm.adt(my.obj) (fix this)
my.obj <- adt.rna.merge(my.obj)
my.obj <- run.pca(my.obj, clust.method = "gene.model", gene.list = readLines("my_model_genes.txt"), batch.norm = F)
#
my.obj <- run.clustering(my.obj,
clust.method = "ward.D",
dist.method = "euclidean",
index.method = "kl",
max.clust = 25,
min.clust = 2,
dims = 1:10)
#
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
#
my.obj <- run.umap(my.obj, dims = 1:10, method = "naive")
#
my.obj <- clust.avg.exp(my.obj)
#
save(my.obj, file = "my.CITE.obj.Robj")
Visualize ADTs and genes
genelist = c("ADT_CD3","CD3E","ADT_CD11c","ITGAX","ADT_CD16","FCGR3A")
for(i in genelist){
MyPlot <- gene.plot(my.obj, gene = i,
interactive = F,
plot.data.type = "tsne",
cell.transparency = 1)
i <- gsub("-",".",i)
eval(call("<-", as.name(i), MyPlot))
}
##
library(gridExtra)
grid.arrange(ADT_CD3,CD3E,ADT_CD11c,ITGAX,ADT_CD16,FCGR3A, ncol = 2)

The rest of the analysis is as scRNA-Seq
Large Scale Bulk RNA-Seq
Read data
This data is already normalized
tcga.data <- read.delim("TCGA.BRCA.tsv",header=TRUE)
rownames(tcga.data) <- tcga.data$gene
tcga.data <- tcga.data[,-1]
Make iCellR object
my.obj <- make.obj(tcga.data)
my.obj@main.data <- my.obj@raw.data
#
my.obj <- gene.stats(my.obj, which.data = "main.data")
#
make.gene.model(my.obj,
dispersion.limit = 1000,
base.mean.rank = 500,
no.mito.model = T,
mark.mito = T,
interactive = F,
out.name = "gene.model")
#
my.obj <- run.pca(my.obj,
clust.method = "gene.model",
gene.list = readLines("my_model_genes.txt"),
batch.norm = F)
#
my.obj <- run.clustering(my.obj,
clust.method = "kmeans",
dist.method = "euclidean",
index.method = "silhouette",
max.clust = 25,
min.clust = 2,
dims = 1:10)
#
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
#
my.obj <- run.umap(my.obj, dims = 1:10, method = "naive")
#
save(my.obj, file = "my.TCGA.obj.Robj")
visualize conditions
cluster.plot(my.obj,
plot.type = "tsne",
col.by = "conditions",
cell.size = 2,
cell.transparency = 1,
interactive = F)
#
cluster.plot(my.obj,
plot.type = "umap",
col.by = "conditions",
cell.size = 2,
cell.transparency = 1,
interactive = F)
